Executing R from Jupyter notebook with Python kernel!

In [ ]:
% load_ext rpy2.ipython
import rpy2.robjects as robj
In [37]:
%%R
N_iter=20
y=rep(NA,N_iter)

set.seed(123)

for(i in 1:N_iter){
cat(i,"\r")
rands=rnorm(2^i)
y[i]=max(rands)
}
1 
2 
3 
4 
5 
6 
7 
8 
9 
10 
11 
12 
13 
14 
15 
16 
17 
18 
19 
20 
In [38]:
%%R

plot(1:N_iter, y, type="l")
In [36]:
%%R
grades=read.csv("grades.csv")
grades$X=NULL
head(grades)

meangrades=c()

for (i in 1:nrow(grades)){
meangrades[i]=mean(as.matrix(grades[i,2:ncol(grades)]))
}

head(meangrades)
[1] 49.25 59.00 44.00 50.00 55.75 56.75
In [17]:
%%R
meangrades = apply (grades[,2:ncol(grades)],1,mean)
head(meangrades)
[1] 49.25 59.00 44.00 50.00 55.75 56.75

Passing objects from Python to R and vice versa

In [186]:
import rpy2.robjects as robj
from rpy2.robjects import r # R instance
import numpy as np
import pandas as pd

To get R object Just subscribe the R instance

In [32]:
r["meangrades"]
Out[32]:
FloatVector with 200 elements.
49.250000 59.000000 44.000000 50.000000 ... 41.500000 51.250000 59.750000 61.000000
In [33]:
type(r["meangrades"])
Out[33]:
rpy2.robjects.vectors.FloatVector
In [42]:
meangrades_np = np.array(r["meangrades"])
meangrades_np[:10]
Out[42]:
array([49.25, 59.  , 44.  , 50.  , 55.75, 56.75, 53.75, 41.5 , 55.  ,
       52.  ])

To pass Python object to R use constructors

In [62]:
r["meangrades_back"]
---------------------------------------------------------------------------
LookupError                               Traceback (most recent call last)
<ipython-input-62-d770a250cbb4> in <module>()
----> 1 r["meangrades_back"]

/usr/local/lib/python3.6/dist-packages/rpy2/robjects/__init__.py in __getitem__(self, item)
    329 
    330     def __getitem__(self, item):
--> 331         res = _globalenv.get(item)
    332         res = conversion.ri2py(res)
    333         if hasattr(res, '__rname__'):

LookupError: 'meangrades_back' not found
In [67]:
meangrades_back = robj.Vector(meangrades_np)
_ = r(f"meangrades_back = {meangrades_back.r_repr()}")
r["meangrades_back"][:10]
Out[67]:
array([49.25, 59.  , 44.  , 50.  , 55.75, 56.75, 53.75, 41.5 , 55.  ,
       52.  ])

Pandas has native binding

In [51]:
from rpy2.robjects import pandas2ri
pandas2ri.activate()
In [54]:
grades = pd.read_csv("grades.csv", index_col=0)
grades.head()
Out[54]:
id write math science socst
1 70 52 41 47 57
2 121 59 53 63 61
3 86 33 54 58 31
4 141 44 47 53 56
5 172 52 57 53 61
In [57]:
r_dataframe = pandas2ri.py2ri(grades)
_ = r(f"grades_back = {r_dataframe.r_repr()}")
In [60]:
r["grades_back"].head()
Out[60]:
id write math science socst
0 70 52 41 47 57
1 121 59 53 63 61
2 86 33 54 58 31
3 141 44 47 53 56
4 172 52 57 53 61

R if

In [187]:
%%R
grades$Filt=NA
head(grades)
   id write math science socst Filt
1  70    52   41      47    57   NA
2 121    59   53      63    61   NA
3  86    33   54      58    31   NA
4 141    44   47      53    56   NA
5 172    52   57      53    61   NA
6 113    52   51      63    61   NA
In [68]:
%%R
for (i in 1:nrow(grades)){
if (grades$write[i] > 50){
grades$Filt[i] = "A"
}
else if (grades$write[i] > 40 & grades$write[i] <= 50){
grades$Filt[i] = "B"
}
else {
grades$Filt[i] = "C"
}
}

head(grades)
   id write math science socst Filt
1  70    52   41      47    57    A
2 121    59   53      63    61    A
3  86    33   54      58    31    C
4 141    44   47      53    56    B
5 172    52   57      53    61    A
6 113    52   51      63    61    A
In [70]:
%%R
x = c(-1,4,-5,2,7)
x

ifelse(x > 0, "pos","neg")
[1] "neg" "pos" "neg" "pos" "pos"

R functions

In [135]:
%%R
My_Func = function (x, y=2){
x ^ y
}
In [137]:
%%R
My_Func(x)
[1]  1 16 25  4 49
In [138]:
%%R
My_Func(2)
[1] 4
In [139]:
%%R
My_Func(2,4)
[1] 16

Lapply and sapply

In [140]:
%%R
l = list(1, c(1,2,3), c(3,4))
l
[[1]]
[1] 1

[[2]]
[1] 1 2 3

[[3]]
[1] 3 4

In [141]:
%%R
lapply(l, sum)
[[1]]
[1] 1

[[2]]
[1] 6

[[3]]
[1] 7

In [142]:
%%R
sapply(l,sum)
[1] 1 6 7
In [143]:
%%R
lapply(l, function(x){return(c(min(x), max(x)))})
[[1]]
[1] 1 1

[[2]]
[1] 1 3

[[3]]
[1] 3 4

In [144]:
%%R
sapply(l, function(x){return(c(min(x), max(x)))})
     [,1] [,2] [,3]
[1,]    1    1    3
[2,]    1    3    4

Installing and importing packages in Rpy2

In [75]:
# import rpy2's package module
import rpy2.robjects.packages as rpackages

# import R's utility package
utils = rpackages.importr('utils')

# select a mirror for R packages
utils.chooseCRANmirror(ind=1) # select the first mirror in the list
Out[75]:
rpy2.rinterface.NULL
In [77]:
# R package names
packnames = ('reshape2', 'ggplot2')

# R vector of strings
from rpy2.robjects.vectors import StrVector

# Selectively install what needs to be install.
# We are fancy, just because we can.
names_to_install = [x for x in packnames if not rpackages.isinstalled(x)]
if len(names_to_install) > 0:
    utils.install_packages(StrVector(names_to_install))
In [80]:
reshape = rpackages.importr('reshape2')

R melt and cast - helpful for subsequent plotting and modelling

title

In [110]:
%%R
a=data.frame(name=c('John', 'Mary', 'Peter', 'Susan'),
sex=c('m','f','m','f'),
age=c(26,21,19,29),
weight=c(82, 56, 79, 60),
height=c(182, 171, 179, 175))
a
   name sex age weight height
1  John   m  26     82    182
2  Mary   f  21     56    171
3 Peter   m  19     79    179
4 Susan   f  29     60    175
In [111]:
%%R
a_melt = melt (a, id.vars = c('name', 'sex'), 
               variable_name = "a_var", value.name = 'a_name')
a_melt
    name sex variable a_name
1   John   m      age     26
2   Mary   f      age     21
3  Peter   m      age     19
4  Susan   f      age     29
5   John   m   weight     82
6   Mary   f   weight     56
7  Peter   m   weight     79
8  Susan   f   weight     60
9   John   m   height    182
10  Mary   f   height    171
11 Peter   m   height    179
12 Susan   f   height    175

Cast == invert melt

In [113]:
%%R
dcast(a_melt, name ~ a_var)

dcast(a_melt, name + sex ~ a_var)
Error in FUN(X[[i]], ...) : object 'a_var' not found

Juggling data using Pandas

Melt and pivot

In [91]:
df = r["a"]
type(df) # pandas2ri converted it automatically
Out[91]:
pandas.core.frame.DataFrame
In [92]:
df
Out[92]:
name sex age weight height
0 John m 26.0 82.0 182.0
1 Mary f 21.0 56.0 171.0
2 Peter m 19.0 79.0 179.0
3 Susan f 29.0 60.0 175.0
In [121]:
molten = df.melt(id_vars=["name", "sex"], 
                 var_name="a_var", value_name="a_name")
molten
Out[121]:
name sex a_var a_name
0 John m age 26.0
1 Mary f age 21.0
2 Peter m age 19.0
3 Susan f age 29.0
4 John m weight 82.0
5 Mary f weight 56.0
6 Peter m weight 79.0
7 Susan f weight 60.0
8 John m height 182.0
9 Mary f height 171.0
10 Peter m height 179.0
11 Susan f height 175.0
In [133]:
molten.pivot(columns="a_var", values="a_name", index="name")
Out[133]:
a_var age height weight
name
John 26.0 182.0 82.0
Mary 21.0 171.0 56.0
Peter 19.0 179.0 79.0
Susan 29.0 175.0 60.0

GroupBy object in Pandas

In [146]:
trials = pd.read_hdf("nrdd_rephub_targets.hdf")
trials.sample(3)
Out[146]:
Name Indication Phase Therapeutic categories rdkit_smiles Targets CID
5442 edaglitazone non-insulin dependent diabetes discontinued {ENDOCRINE DRUGS} Cc1oc(-c2ccccc2)nc1CCOc1ccc(CC2SC(=O)NC2=O)c2s... NaN 9825701
7117 modafinil shift work disorder (swd) pre-registration and above {NEUROLOGIC DRUGS, PSYCHOPHARMACOLOGIC DRUGS, ... NC(=O)CS(=O)C(c1ccccc1)c1ccccc1 {CYP2D6, CYP2C19, CYP3A4, Slc6a3, PTGS2, SLC6A... 4236
2600 il-16 asthma discontinued {ANTIINFECTIVE THERAPY} NN1C(=O)CC(c2cccc(Br)c2)C1=O NaN 125225

title

In [148]:
trials_gby = trials.groupby("Phase")
Out[148]:
<pandas.core.groupby.DataFrameGroupBy object at 0x7fb19d436c88>
In [165]:
sizes = trials_gby.size().sort_values()
sizes
Out[165]:
Phase
phase 1 clinical               123
discovery                      135
phase 3 clinical               208
phase 2 clinical               421
pre-registration and above    2961
discontinued                  3434
dtype: int64
In [191]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid", font_scale=1.6)
In [192]:
sns.barplot(sizes.index, sizes.values)
plt.xticks(rotation=45)
Out[192]:
(array([0, 1, 2, 3, 4, 5]), <a list of 6 Text xticklabel objects>)

.apply - get one value for each group

In [173]:
def size_apply(df):
    return len(df)
In [174]:
trials_gby.apply(size_apply)
Out[174]:
Phase
discontinued                  3434
discovery                      135
phase 1 clinical               123
phase 2 clinical               421
phase 3 clinical               208
pre-registration and above    2961
dtype: int64

.agg - get one value for each column in each group

In [179]:
def size_agg(ser):
    return len(ser)
In [180]:
trials_gby.agg(size_agg)
Out[180]:
Name Indication Therapeutic categories rdkit_smiles Targets CID
Phase
discontinued 3434 3434 3434 3434 3434 3434
discovery 135 135 135 135 135 135
phase 1 clinical 123 123 123 123 123 123
phase 2 clinical 421 421 421 421 421 421
phase 3 clinical 208 208 208 208 208 208
pre-registration and above 2961 2961 2961 2961 2961 2961
In [183]:
from random import choice
grades = pd.read_csv("grades.csv", index_col=0)
grades["favourite_color"] = [choice(["blue", "red", 
                                    "green", "hazelnut"])
                             for _ in grades.index]
In [185]:
grades_gby = grades.groupby("favourite_color")
grades_gby.agg(np.mean)
Out[185]:
id write math science socst
favourite_color
blue 98.294118 50.784314 51.529412 49.294118 52.490196
green 100.810345 54.586207 53.931034 52.137931 52.603448
hazelnut 105.794872 51.128205 51.897436 53.282051 51.589744
red 98.346154 53.942308 52.865385 52.961538 52.711538